Setup

library(SeqArray)
library(SNPRelate)
library(pander)
library(scales)
library(magrittr)
library(tidyverse)
library(readxl)
library(sp)
library(ggmap)
library(rgdal)
library(ggsn)
library(parallel)
library(qqman)
theme_set(theme_bw())
mc <- min(12, detectCores() - 1)
alpha <- 0.05
w <- 169/25.4
h <- 169/25.4
gdsPath <- file.path("..", "5_stacks", "gds", "populations.snps.gds")
gdsFile <- seqOpen(gdsPath, readonly = TRUE)
keepSNPs <- readRDS("keepSNPsAfterLDPruning.RDS")
sampleID <- tibble(
    Sample = seqGetData(gdsFile, "sample.annotation/Sample"),
    Population = seqGetData(gdsFile, "sample.annotation/Population"),
    Location = seqGetData(gdsFile, "sample.annotation/Location")
)
popSizes <- sampleID %>% 
    group_by(Population) %>%
    summarise(n = n())
genotypes <- tibble(
    variant.id = seqGetData(gdsFile, "variant.id") ,
    chromosome = seqGetData(gdsFile, "chromosome"),
    position = seqGetData(gdsFile, "position")
) %>%
    cbind(seqGetData(gdsFile, "genotype") %>% 
              apply(MARGIN = 3, colSums) %>%
              t %>%
              set_colnames(sampleID$Sample)) %>%
    as_tibble() %>%
    right_join(keepSNPs) %>%
    gather(Sample, Genotype, -variant.id, -chromosome, -position) %>%
    dplyr::filter(!is.na(Genotype)) %>%
    arrange(variant.id, Sample) %>%
    left_join(sampleID)

This workflow immediately follows 03_SNPFiltering and performs an analysis on the SNPs retained after filtering. Analysis performed is:

PCA

lowCall <- c("gc2901", "gc2776", "gc2731", "gc2727", "gc2686")
snp4PCA <- genotypes %>% 
    filter(!Sample %in% lowCall) %>%
    group_by(variant.id, Population) %>% 
    summarise(n = n()) %>% 
    spread(Population, n) %>%
    mutate(N = `1996` + `2010` + `2012`) %>% 
    ungroup() %>%
    filter(N > 0.95*(sum(popSizes$n) - length(lowCall)))
pca <- genotypes %>%
    filter(variant.id %in% snp4PCA$variant.id) %>%
    dplyr::select(variant.id, Sample, Genotype) %>%
    spread(Sample, Genotype) %>%
    as.data.frame() %>%
    column_to_rownames("variant.id") %>%
    as.matrix() %>%
    apply(2, function(x){
        x[is.na(x)] <- mean(x, na.rm = TRUE)
        x
    }) %>%
    t() %>%
    .[, apply(., 2, function(x){length(unique(x)) > 1})] %>%
    prcomp( center = TRUE)

As noted in the previous section sample samples gc2901, gc2776, gc2731, gc2727 and gc2686 had a SNP identification rate \(< 50\)% and as such these were markd as potential outliers. Ignoring these samples, and restricting data to SNPs identified in \(>95\)% of all samples, a preliminary PCA was performed This amounted to 7,768 of the possible 18,886 SNPs for analysis using PCA. Missing values were specified as the mean MAF over all populations combined.

Given the initially observed structure, in which samples from the 2012 population are separating from the other samples which group with the 1996 population, the collection points for the 2012 samples as investigated.

sampleID %<>%
    left_join(
        file.path("..", "external", "GPS_Locations.xlsx") %>%
            read_excel() %>%
            dplyr::select(Sample, ends_with("tude")) %>%
            mutate(Sample = gsub("[Oo][Rr][Aa] ([0-9ABC]*)", "ora\\1", Sample))
    )
*PCA showing population structure. Point size reflects the proportion of SNPs for which imputation was required, and the observed structure appeared independent of this.*

PCA showing population structure. Point size reflects the proportion of SNPs for which imputation was required, and the observed structure appeared independent of this.

loc <- c(range(sampleID$Longitude, na.rm = TRUE) %>% mean,
         range(sampleID$Latitude, na.rm = TRUE) %>% mean)
saPoly <- readRDS("saPoly.RDS")
roads <- readRDS("roads.RDS")
gc <- SpatialPoints(cbind(x = 138.655972, y = -31.200305))
proj4string(gc) <- "+proj=longlat +ellps=GRS80 +no_defs"
xBreaks <- seq(138.65, 138.8, by = 0.05)
xLabs <- parse(text = paste(xBreaks, "*degree ~ E", sep = ""))
yBreaks <- seq(-31.2, -31.32, by = -0.04)
yLabs <- parse(text = paste(-yBreaks, "*degree ~ S", sep = ""))
leftN <- tibble(x = c(138.7965, 138.8, 138.8) - 0.01,
                    y = c(-31.2, -31.198, -31.193))
rightN <- tibble(x = c(138.8, 138.8, 138.8035) - 0.01,
                     y = c(-31.193, -31.198, -31.2))
ggMap <- get_map(loc, zoom = 12, maptype = "terrain", color = "bw")
zoomPlot <- ggmap(ggMap, extent = "normal", maprange = FALSE) +
    geom_path(
        aes(long, lat, group = group), 
        data = subset(roads, SURFACE == "UNSE"), 
        linetype = 2, size = 0.3) + 
    geom_path(
        aes(long, lat, group = group), 
        data = subset(roads, SURFACE != "UNSE"), 
        linetype = 1, size = 0.4) +
    geom_label(x = 138.74, y = -31.29, label = "Flinders Ranges NP", alpha = 0.4) +
    geom_label(x = 138.72, y = -31.22, label = "Gum Creek", alpha = 0.4) +
    geom_point(aes(x, y), data = as.data.frame(gc), shape = 3, size = 3) +
    geom_text(aes(x, y), label = "HS", data = as.data.frame(gc), nudge_y = 0.005) +
    geom_polygon(
        data = subset(saPoly, F_CODE == "HD"),
        aes(long, lat, group = group),
        fill = rgb(1, 1, 1, 0), colour = "grey10", size = 0.3) +
    geom_polygon(
        data = subset(saPoly, F_CODE == "PARK"),
        aes(long, lat, group = group),
        fill = rgb(1, 1, 1, 0), colour = "grey10", size = 0.2) +
    geom_point(
        data= filter(pca4Plot, grepl("2012", Population)),
        aes(Longitude, Latitude, colour = Population),
        size = 0.9*ps) +
    geom_polygon(aes(x, y), data = leftN, fill = "white", colour = "grey10", size = 0.4) +
    geom_polygon(aes(x, y), data = rightN, fill = "grey10", colour = "grey10", size = 0.4) +
    geom_text(x = 138.79, y = -31.19, label = "N", 
              colour = "grey10", size = 4) +
    scale_colour_manual(values = popCols[2:3]) +
    coord_cartesian(xlim = c(138.618, 138.81),
                    ylim = c(-31.335, -31.18),
                    expand = 0) +
    scale_x_continuous(breaks = xBreaks, labels = xLabs) +
    scale_y_continuous(breaks = yBreaks, labels = yLabs) +
    guides(colour = FALSE) +
    ggsn::scalebar(x.min = 138.618, x.max = 138.81,
                   y.min = -31.335, y.max = -31.18,
                   transform = TRUE,
                   dist = 2, dist_unit = "km",
                   model = 'GRS80',
                   height = 0.012, st.size = 4,
                   location = 'bottomright',
                   anchor = c(x = 138.8, y = -31.328)) +
    labs(x = "Longitude",
         y = "Latitude") +
    theme(text = element_text(size = fs),
          plot.margin = unit(c(1, 1, 1, 1), "mm"))
ausPolygon <- readRDS("ausPolygon.RDS")
ausPts <- SpatialPoints(cbind(x = loc[1], y = loc[2]))
proj4string(ausPts) <- proj4string(ausPolygon)
ausPlot <- ggplot() +
  geom_polygon(data = ausPolygon, 
               aes(long, lat, group = group), fill = "white", colour = "black") + 
  geom_point(data = as.data.frame(ausPts), aes(x, y), size = 1.5) +
  theme_void() +
  theme(plot.background = element_rect(fill = "white", colour = "black"))
## png 
##   2
*Figure 1: Collection points for all 2012 samples with colours showing sub-populations initially defined by PCA analysis and *k*-means clustering.*

Figure 1: Collection points for all 2012 samples with colours showing sub-populations initially defined by PCA analysis and k-means clustering.

zoomLoc <- c(138.753, -31.242)
xBreaks <- seq(138.74, 138.76, by = 0.01)
xLabs <- parse(text = paste(xBreaks, "*degree ~ W", sep = ""))
yBreaks <- seq(-31.235, -31.25, by = -0.005)
yLabs <- parse(text = paste(-yBreaks, "*degree ~ S", sep = ""))
central <- rbind(x = c(138.749, 138.755),
                 y = c(-31.2365, -31.2495)) %>%
  set_colnames(c("min", "max"))
leftN <- tibble(x = c(138.7595, 138.76, 138.76),
                    y = c(-31.234, -31.2335, -31.2325))
rightN <- tibble(x = c(138.76, 138.76, 138.7605),
                     y = c(-31.2325, -31.2335, -31.234))
get_map(zoomLoc, zoom = 15, maptype = "terrain", source = "google", color = "bw") %>%
    ggmap() +
    geom_jitter(
        data = filter(pca4Plot, grepl("2012", Population)), 
        aes(Longitude, Latitude, colour = Population), 
        size = 3, width = 0.0005, height = 0) +
    geom_rect(
        xmin = central["x", "min"],
        xmax = central["x", "max"],
        ymin = central["y", "min"],
        ymax = central["y", "max"],
        fill = "red", alpha = 0.01, colour = "black") +
    geom_polygon(aes(x, y), data = leftN, fill = "white", colour = "grey10", size = 0.4) +
    geom_polygon(aes(x, y), data = rightN, fill = "grey10", colour = "grey10", size = 0.4) +
    geom_text(x = 138.76, y = -31.232, label = "N", 
              colour = "grey10", size = 5) +
    scale_colour_manual(values = popCols[2:3]) +
    scale_x_continuous(breaks = xBreaks, labels = xLabs) +
    scale_y_continuous(breaks = yBreaks, labels = yLabs) +
    theme_bw() +
    guides(colour = FALSE) +
    labs(x = "Longitude",
         y = "Latitude") +
    coord_cartesian(xlim = c(138.74, 138.762),
                    ylim = c(-31.253, -31.231),
                    expand = 0) +
    ggsn::scalebar(
        x.min = 138.74, x.max = 138.762, 
        y.min = -31.253, y.max = -31.231,
        transform = TRUE,
        dist = 0.25, dist_unit = "km",, model = 'GRS80', 
        height = 0.012, st.size = 4,
        location = 'bottomright',
        anchor = c(x = 138.761, y = -31.252)
    )
*Zoomed-in view of the central region for 2012 samples with colours showing sub-populations defined by PCA analysis. The region considered to be the Central Region is shaded in red. Due to overlapping GPS points a small amount of jitter has been added to the x-axis.*

Zoomed-in view of the central region for 2012 samples with colours showing sub-populations defined by PCA analysis. The region considered to be the Central Region is shaded in red. Due to overlapping GPS points a small amount of jitter has been added to the x-axis.

Region Analysis

Removal of SNPs Associated with Collection Region

The structure observed within the 2012 population in the PCA could possibly be explained by recent migration into this region. As the samples collected in the outer regions appeared very similar to the 1996 population in the above plots, this would possibly indicate this a very recent event as the genetic influence of this has not spread through the wider area. Although this may be due to other factors such as sampling bias, this structure was addressed by identifying SNPs which showed an association with the sub-populations identified by PCA analysis. In this way, any candidate SNPs obtained below will be less impacted by this structure, and will be more reflective of the intended variable under study, as opposed to any internal structure of the 2012 population.

oraRegions <- pca4Plot %>% 
  filter(grepl("2012", Population)) %>% 
  rowwise() %>%
  mutate(yCentral = cut(Latitude, breaks = central["y",], include.lowest = TRUE), 
         xCentral = cut(Longitude, breaks = central["x",], include.lowest = TRUE),
         Central = (is.na(yCentral) + is.na(xCentral)) == 0) %>%
  dplyr::select(Sample, Central) 

Testing for Structure in 2012

This model tests:
H0: No association between genotypes and collection region
HA: An association exists between genotypes and collection region

regionResults <- genotypes %>%
    filter(Population == 2012) %>%
    split(f = .$variant.id) %>%
    mclapply(function(x){
        ft <- list(p.value = NA)
        if (length(unique(x$Genotype)) > 1) {
            ft <- x %>%
                left_join(oraRegions) %>%
                group_by(Genotype, Central) %>%
                tally() %>%
                spread(Genotype, n, fill = 0) %>%
                column_to_rownames("Central") %>%
                fisher.test()
        }
        x %>%
            distinct(variant.id, chromosome, position) %>%
            mutate(p = ft$p.value)
    }, mc.cores = mc) %>%
    bind_rows() 

A total of 1684 SNPs were detected as showing a significant association between genotype and the collection region. Under H0, the number expected using α = 0.05 would be 944, and as this number was approximately double that expected, this was taken as evidence of this being a genuine point of concern for this dataset.

Notably, Type II errors were of principle concern in this instance, and as such every SNP with p < 0.05 in the above test was excluded from downstream analysis.

regionSNPs <- filter(regionResults, p < 0.05)$variant.id
saveRDS(regionSNPs, "regionSNPs.RDS")

Under this additional filtering step, the original set of 18886 SNPs will be reduced to 17,108 for testing by genotype and allele frequency.

Verification Of Removal

In order to verify that the removal of the above SNPs removed the undesired population structure from the 2012 population, the above PCA was repeated, excluding the SNPs marked for removal. The previous structure noted in the data was no longer evident, and as such, these SNPs were marked for removal during analysis by genotype and allele frequency.

pcaPost <- genotypes %>%
    filter(variant.id %in% snp4PCA$variant.id,
           !variant.id %in% regionSNPs) %>%
    dplyr::select(variant.id, Sample, Genotype) %>%
    spread(Sample, Genotype) %>%
    as.data.frame() %>%
    column_to_rownames("variant.id") %>%
    as.matrix() %>%
    apply(2, function(x){
        x[is.na(x)] <- mean(x, na.rm = TRUE)
        x
    }) %>%
    t() %>%
    .[, apply(., 2, function(x){length(unique(x)) > 1})] %>%
    prcomp( center = TRUE) 
pcaPost4Plot <- pcaPost$x %>%
    as.data.frame() %>%
    rownames_to_column("Sample") %>%
    as_tibble() %>%
    dplyr::select(Sample, PC1, PC2, PC3) %>%
    left_join(sampleID) %>%
    mutate(Cluster = kmeans(cbind(PC1, PC2, PC3), 3)$cluster) %>%
    group_by(Cluster) %>%
    mutate(maxY = max(Latitude, na.rm = TRUE)) %>%
    ungroup() %>%
    mutate(Population = case_when(
        Population == 1996 ~ "1996",
        Population == 2010 ~ "Outgroup (Turretfield)",
        maxY == max(maxY) ~ "2012 (Outer)",
        maxY != max(maxY) ~ "2012 (Central)"
    )) %>%
    left_join(genotypes %>% 
                  filter(variant.id %in% snp4PCA$variant.id,
                         !Sample %in% lowCall) %>% 
                  group_by(Sample) %>% 
                  tally() %>% 
                  mutate(imputationRate = 1 - n / nrow(snp4PCA))) 
*Figure 2: Principal Components Analysis showing structures before removal of SNPs denoting collection region in the 2012 population, and after removal of these SNPs*

Figure 2: Principal Components Analysis showing structures before removal of SNPs denoting collection region in the 2012 population, and after removal of these SNPs

SNP Analysis

Genotype Frequency Model

This model tests:
H0: No association between genotypes and populations
HA: An association exists between genotypes and populations

genotypeResults <- genotypes %>%
    filter(Population != 2010,
           !variant.id %in% regionSNPs) %>%
    group_by(variant.id, Population, Genotype) %>%
    tally() %>%
    ungroup() %>%
    split(f = .$variant.id) %>%
    mclapply(function(x){
        ft <- list(p.value = NA)
        if (length(unique(x$Genotype)) > 1) {
            ft <- x %>%
                spread(Genotype, n, fill = 0) %>%
                column_to_rownames("Population") %>%
                dplyr::select(-variant.id) %>%
                fisher.test()
        }
        x %>% 
            distinct(variant.id) %>%
            mutate(p = ft$p.value)
    },mc.cores = mc) %>%
    bind_rows() %>%
    filter(!is.na(p)) %>%
    mutate(FDR = p.adjust(p, "fdr"),
           adjP = p.adjust(p, "bonferroni")) %>%
    arrange(p) %>%
    left_join(genotypes %>%
                  distinct(variant.id, chromosome, position)) %>%
    dplyr::select(variant.id, chromosome, position, everything())

Under the full genotype model:

  • 17 genotypes were detected as being significantly associated with the two populations when controlling the FWER at α = 0.05
  • 44 genotypes were detected as being significantly associated with the two populations when controlling the FDR at α = 0.05
  • For the most highly ranked SNP (), the minor allele has been completely lost in the 2012 population
SNPs with adjusted p-values < 0.05 when analysing by genotype.
variant.id chromosome position p FDR adjP
103,513 GL018705 1,937,359 6.103e-08 0.0009039 0.001048
20,241 3 90,851,512 1.053e-07 0.0009039 0.001808
27,266 4 89,602,973 1.865e-07 0.0009841 0.003202
25,051 4 35,111,855 2.587e-07 0.0009841 0.004441
9,251 2 2.8e+07 2.866e-07 0.0009841 0.00492
68,488 13 129,650,867 4.419e-07 0.001052 0.007586
93,489 19 30,689,426 4.838e-07 0.001052 0.008306
139,689 GL019119 147,630 4.901e-07 0.001052 0.008414
142,596 GL019332 5,745 6.122e-07 0.001168 0.01051
33,737 7 35,584,375 9.805e-07 0.001583 0.01683
13,016 2 125,603,458 1.014e-06 0.001583 0.01741
80,891 16 40,958,486 1.232e-06 0.001762 0.02114
102,846 GL018704 1,294,009 1.387e-06 0.001832 0.02382
115,793 GL018751 737,318 1.837e-06 0.002246 0.03154
64,613 13 45,086,624 2.028e-06 0.002246 0.03481
59,189 12 60,113,885 2.094e-06 0.002246 0.03594
69,754 13 138,565,192 2.81e-06 0.002838 0.04825
Figure 3a: Manhattan plot showing results for all SNPs on chromosomes 1:21 for analysis by genotype. The horizontal line indicates the cutoff for an FDR of 5%, with SNPs with an adjusted p-value < 0,05 shown in green

Figure 3a: Manhattan plot showing results for all SNPs on chromosomes 1:21 for analysis by genotype. The horizontal line indicates the cutoff for an FDR of 5%, with SNPs with an adjusted p-value < 0,05 shown in green

Allele Frequency Model

This model tests:
H0: No association between allele frequencies and populations
HA: An association exists between allele frequencies and populations

alleleResults <- genotypes %>%
    filter(Population != 2010,
           !variant.id %in% regionSNPs) %>%
    group_by(variant.id, Population) %>%
    summarise(P = sum(2 - Genotype),
              Q = sum(Genotype)) %>%
    ungroup() %>%
    split(f = .$variant.id) %>%
    mclapply(function(x){
        m <- as.matrix(x[c("P", "Q")])
        ft <- list(p.value = NA) 
        if (length(m) == 4) {
            ft <- fisher.test(m)
        }
        x %>%
            mutate(MAF = Q / (P + Q)) %>%
            dplyr::select(variant.id, Population, MAF) %>%
            spread(Population, MAF) %>%
            mutate(p = ft$p.value)
    },mc.cores = mc) %>%
    bind_rows() %>%
    filter(!is.na(p)) %>%
    rename(MAF_1996 = `1996`,
           MAF_2012 = `2012`) %>%
    mutate(FDR = p.adjust(p, "fdr"),
           adjP = p.adjust(p, "bonferroni")) %>%
    arrange(p) %>%
    left_join(genotypes %>%
                  distinct(variant.id, chromosome, position)) %>%
    dplyr::select(variant.id, chromosome, position, everything())

Under this model:

  • 5 SNP alleles were detected as being significantly associated with the two populations when controlling the FWER at α = 0.05. However, as these SNPs were within 21nt of each other, this may represent the same haplotype
  • 42 SNP alleles were detected as being significantly associated with the two populations when controlling the FDR at α = 0.05
SNPs considered as significant when analysing by genotype using an FDR cutoff of 0.05
variant.id chromosome position MAF_1996 MAF_2012 p FDR adjP
103,513 GL018705 1,937,359 0.2255 0.009615 3.44e-07 0.005913 0.005918
27,266 4 89,602,973 0.1961 0 6.875e-07 0.005913 0.01183
93,489 19 30,689,426 0.1863 0 1.531e-06 0.006677 0.02633
9,251 2 2.8e+07 0.25 0.0122 1.866e-06 0.006677 0.0321
31,004 6 9,037,073 0.4432 0.1277 2.52e-06 0.006677 0.04334
80,891 16 40,958,486 0 0.17 3.045e-06 0.006677 0.05238
25,051 4 35,111,855 0.2907 0.04082 3.086e-06 0.006677 0.05309
13,016 2 125,603,458 0.1915 0 3.105e-06 0.006677 0.05342
68,488 13 129,650,867 0.2708 0.03333 4.172e-06 0.007105 0.07177
59,189 12 60,113,885 0.1633 0 4.749e-06 0.007105 0.08169
102,846 GL018704 1,294,009 0.2065 0.0102 4.865e-06 0.007105 0.08369
135,203 GL018969 165,377 0 0.1818 4.956e-06 0.007105 0.08526
139,689 GL019119 147,630 0.2736 0.04167 5.576e-06 0.007322 0.09592
69,754 13 138,565,192 0 0.1538 5.959e-06 0.007322 0.1025
133,433 GL018933 204,058 0.01754 0.2019 7.749e-06 0.008887 0.1333
33,737 7 35,584,375 0.2841 0.04348 9.725e-06 0.01046 0.1673
115,793 GL018751 737,318 0.2609 0.03261 1.275e-05 0.0123 0.2194
142,596 GL019332 5,745 0.3077 0.05814 1.314e-05 0.0123 0.2261
64,613 13 45,086,624 0.2609 0.03409 1.412e-05 0.0123 0.243
2,227 1 64,635,286 0.186 0 1.43e-05 0.0123 0.246
131,672 GL018907 283,766 0.1932 0.01 1.627e-05 0.01291 0.2799
81,518 16 56,527,308 0.05814 0.3043 1.652e-05 0.01291 0.2841
138,810 GL019077 34,265 0.2232 0.02941 2.279e-05 0.01704 0.392
60,986 12 141,844,290 0.08889 0.3556 2.435e-05 0.01745 0.4189
88,912 18 25,311,176 0.1731 0.4434 2.542e-05 0.01749 0.4373
74,214 14 97,813,263 0.02083 0.2021 4.48e-05 0.02964 0.7707
91,957 19 13,055,154 0.03191 0.234 5.416e-05 0.03373 0.9317
138,713 GL019084 74,038 0.1961 0.02041 5.49e-05 0.03373 0.9444
136,355 GL018985 129,495 0.1702 0.01042 6.686e-05 0.03898 1
136,354 GL018985 123,328 0.3636 0.1275 6.797e-05 0.03898 1
58,800 12 40,509,779 0.3214 0.5943 7.288e-05 0.03931 1
32,502 7 2,702,919 0.2143 0.03774 7.807e-05 0.03931 1
92,678 19 19,178,148 0.22 0.03261 7.861e-05 0.03931 1
62,212 13 2,979,383 0.01818 0.1731 8.051e-05 0.03931 1
122,683 GL018791 957,960 0.163 0.009615 8.094e-05 0.03931 1
78,185 15 87,473,844 0.3511 0.1154 8.416e-05 0.03931 1
77,077 15 43,156,117 0.1531 0.4057 8.594e-05 0.03931 1
131,070 GL018883 283,912 0.3654 0.125 8.683e-05 0.03931 1
140,313 GL019154 883 0.234 0.04082 8.916e-05 0.03933 1
20,241 3 90,851,512 0.4375 0.1667 9.731e-05 0.04185 1
93,211 19 26,020,372 0.1939 0.02083 0.0001019 0.04276 1
25,857 4 62,404,979 0 0.1463 0.0001174 0.04807 1
## png 
##   2
Manhattan plot showing results for all SNPs on chromosomes 1:21 when analysing by allele frequencies. The horizontal line indicates the cutoff for an FDR of 5%, with SNPs considered significant under the Bonferroni adjustment shown in green.

Manhattan plot showing results for all SNPs on chromosomes 1:21 when analysing by allele frequencies. The horizontal line indicates the cutoff for an FDR of 5%, with SNPs considered significant under the Bonferroni adjustment shown in green.